# install.packages(c("ggplot2", "reshape", "mvtnorm", "numDeriv", "list"))
library(ggplot2)
library(reshape)
library(mvtnorm)
library(numDeriv)
library(list)

# Root directory of the replication material
workingDirectory <- "~/Dropbox (Personal)/Academic/The Statistical Analysis of Misreporting/ReplicationData"

# Directory to output figures
outputDirectory <- "~/Dropbox (Personal)/Academic/The Statistical Analysis of Misreporting/ReplicationData"

setwd(workingDirectory)
source("Functions.R")

# A custom ggplot theme
my.theme <- function(base_size = 11, base_family = "",
                     grid.x_colour = NA, grid.y_colour = NA,
                     grid.x_linetype = 1, grid.y_linetype = 1,
                     strip_colour = "grey15", strip_text = "black",
                     background_colour = "transparent",
                     tick_colour = "black",
                     borderless = 0, bordersize = 0.5) { 
  if(is.na(grid.x_colour)) grid.x_colour <- "transparent"
  if(is.na(grid.y_colour)) grid.y_colour <- "transparent"
  if(borderless == 2) {
    border <- theme(panel.border = element_blank(),
                    panel.background = element_blank(),
                    strip.background = element_blank())
  }

  else if(borderless == 1) {
    border <- theme(axis.line = element_line(colour = "black", size = 0.25),
                    panel.border = element_blank(),
                    panel.background = element_blank(),
                    strip.background = element_blank())
  }
  else if(borderless == 0) border <- theme()
    theme( 
      axis.text.x       = element_text(family = base_family, colour = "grey15", size = base_size, vjust = 1, lineheight = 0.9),
      axis.text.y       = element_text(family = base_family, colour = "grey15", size = base_size, hjust = 1, lineheight = 0.9),
      axis.ticks        = element_line(colour = tick_colour, size = 0.2),
      axis.ticks.length = unit(0.25, "lines"),
      legend.background = element_blank(),
      legend.key = element_blank(),
      legend.key.size = unit(0.6, "lines"),
      legend.text = element_text(family = base_family, size = base_size, face = "plain", lineheight = 1),
      legend.text.align = 0, 
      legend.title = element_text(family = base_family, size = base_size, face = "plain", hjust = 0),
      legend.title.align = 0,
      legend.position = "top",
      legend.direction = "horizontal",
      legend.justification = c(1, 1),
      panel.background = element_rect(fill = "white"),
      panel.border = element_rect(fill = "transparent", colour = "black", size = bordersize),
      panel.grid.major.x = element_line(colour = grid.x_colour, size = 0.15, linetype = grid.x_linetype),
      panel.grid.major.y = element_line(colour = grid.y_colour, size = 0.15, linetype = grid.y_linetype),
      panel.grid.minor = element_blank(),
      panel.spacing = unit(0.75, "lines"),
      strip.background = element_blank(),
      strip.text.x = element_text(family = base_family, size = base_size, face = "plain", colour = strip_text),
      strip.text.y = element_text(family = base_family, size = base_size, face = "plain", angle = -90, colour = strip_text),
      plot.background = element_rect(fill = background_colour, colour = background_colour),
      plot.title = element_text(hjust = 0.5, family = base_family, size = base_size * 1.1, vjust = 0, face = "bold")
    ) +
    border
}

A <- read.csv("ListGender.csv", stringsAsFactors = FALSE)

# Set factor levels
A$gender <-       factor(A$gender, levels = c("Man", "Woman"))
A$ageGroup <-     factor(A$ageGroup, levels = c("18-29", "30-39", "40-49", "50-64", "65+"))
A$education <-    factor(A$education, levels = c("High school or below", "College", "University degree"))
A$motherTongue <- factor(A$motherTongue, levels = c("English", "French", "Other language"))
A$region <-       factor(A$region, levels = c("Ontario", "Atlantic", "Quebec", "West"))

# Total completed surveys
nrow(A)

# APPENDIX: Proportion who fail screener (0 = incorrect answer)
prop.table(table(A$attentionCheck)) * 100

# APPENDIX: Is failure to pass the attention check different between treatment and control?
prop.table(table(A$listGenderTreatment, A$attentionCheck), 1)
chisq.test(table(A$listGenderTreatment, A$attentionCheck))

# Use subset of those who passed the screener question
A <- subset(A, attentionCheck == 1)

# APPENDIX: Obvious violations of monotonicity assumption
# Recall that answering ZERO to the direct question indicates the sensitive response
table(A$listGender[A$listGenderTreatment == 1] == 5 & A$directGender[A$listGenderTreatment == 1] == 0)
prop.table(table(A$listGender[A$listGenderTreatment == 1] == 5 & A$directGender[A$listGenderTreatment == 1] == 0))

# Use subset of those who do not clearly violate monotonicity
# (99.99% of respondents)
A <- subset(A, !(A$listGender == 5 & A$directGender == 0))

# APPENDIX: Test for differences in responses to the control items
# by treatment assignment
ict.test(A$listGender, A$listGenderTreatment, J = 4)

# APPENDIX: Among those who provide the sensitive response to the direct
# question (D_i = 0), the difference in the means of Y for the control and
# treatment groups should be equivalent: by monotonicity and no lying, in
# the list experiment respondents are expected to also answer 0 to the
# sensitive item.
t.test(A$listGender[A$directGender == 0 & A$listGenderTreatment == 0],
       A$listGender[A$directGender == 0 & A$listGenderTreatment == 1])

# APPENDIX: Test whether the response to the direct question is affected by
# treatment assignment (i.e. having previously answered/not answered the
# sensitive item in list experiment)
chisq.test(table(A$directGender, A$listGenderTreatment))

# FIGURE 4: Graph list experiment versus direct question proportions
G <- data.frame(group = c("List\nexperiment", "Direct\nquestion"),
                value = c(weighted.mean(A$listGender[A$listGenderTreatment == 1], A$weight[A$listGenderTreatment == 1]) - weighted.mean(A$listGender[A$listGenderTreatment == 0], A$weight[A$listGenderTreatment == 0]),
                          weighted.mean(A$directGender, A$weight)) * 100)

pdf(paste0(outputDirectory, "/EADY-FIGURE4-ListDirect-raw.pdf"), 2.75, 2.5)
ggplot(G, aes(x = group, y = value, fill = group)) +
  my.theme(borderless = 2, base_size = 9) +
  coord_cartesian(ylim = c(0, 125), xlim = c(0.5, 2.5), expand = FALSE) +
  labs(x = "", y = "Proportion that agree", title = "Women are as competent\nas men in politics") +
  scale_y_continuous(breaks = c()) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.50), width = 0.50, colour = "black", size = 0.25) +
  geom_text(aes(y = value + 12, label = round(value, 1)), size = 3.3) +
  geom_hline(yintercept = 0, size = 0.75) +
  scale_fill_manual(values = c("white", "black")) +
  theme(legend.position = "none",
        plot.margin = unit(c(0, 0.25, -0.25, 0.25), "lines"))
dev.off()

# MODEL 1: Model without sub-model for misreporting
set.seed(1)
model.1 <- listExperiment(listGender ~ selfPlacement +
                                       gender +
                                       ageGroup +
                                       education +
                                       motherTongue +
                                       region,
                          data = A, J = 4, n.runs = 5,
                          treatment = "listGenderTreatment")
# saveRDS(model.1, "Model1.rds")   # Save fitted model object
# model.1 <- readRDS("Model1.rds") # Load fitted model object

# MODEL 2: Model including misreport sub-model
set.seed(2)
model.2 <- listExperiment(listGender ~ selfPlacement +
                                       gender +
                                       ageGroup +
                                       education +
                                       motherTongue +
                                       region,
                          data = A, J = 4, n.runs = 5,
                          treatment = "listGenderTreatment",
                          direct = "directGender",
                          sensitive.response = 0)
# saveRDS(model.2, "Model2.rds")   # Save fitted model object
# model.2 <- readRDS("Model2.rds") # Load fitted model object

##### TABLE 4
# Model 1 sensitive-item sub-model
summary(model.1, model = "sensitive", intercept = "bottom", show.p = FALSE)

# Model 2 sensitive-item sub-model
summary(model.2, model = "sensitive", intercept = "bottom", show.p = FALSE)

# Model 2 misreport sub-model
summary(model.2, model = "misreport", intercept = "bottom", show.p = FALSE)


##### Table A3
# Model 1 control-items sub-model
summary(model.1, model = "control", intercept = "bottom", show.p = FALSE)

# Model 2 control-items sub-model
summary(model.2, model = "control", intercept = "bottom", show.p = FALSE)


# Simluate predicted probabilities of misreporting
sims <- 5000

# Simulate coefficients
set.seed(3)
model.2.coef <- c(model.2$par.control, model.2$par.sens, model.2$par.misreport)
par.sim <- rmvnorm(sims, model.2.coef, model.2$vcov.mle)

# Coefficients for sensitive-item sub-model
par.sim.sensitive <- par.sim[, (length(model.2$par.control) + 1):(length(model.2.coef) - length(model.2$par.misreport))]
par.sim.sensitive <- apply(par.sim.sensitive, 1, function(x) list(x))

# Coefficients for misreport sub-model
par.sim.misreport <- par.sim[, (length(model.2.coef) - length(model.2$par.misreport) + 1):length(model.2.coef)]
par.sim.misreport <- apply(par.sim.misreport, 1, function(x) list(x))

# Model matrices with predictors for the sensitive-item and misreport sub-models
A.Sim.Sensitive <- data.frame(model.matrix(model.2$formula.sensitive, data = A))
A.Sim.Misreport <- data.frame(model.matrix(model.2$formula.misreport, data = A), treat = 0)

# Posterior predicted probabilities of each respondent holding the sensitive belief
A$pppSens <- exp(apply(model.2$w[, 1:2], 1, function(X) logAdd(X[1], X[2])))
A$pppSens[is.nan(A$pppSens)] <- 0

# Get mean predicted probabilities for the specified model and value of variable ("selfPlacement")
getSim <- function(variable, value, par.sim, data, weight) {
  data[, variable] <- value
  pp <- sapply(par.sim, function(coefs) weighted.mean(plogis(as.matrix(data) %*% unlist(coefs)), weight))
  return(list(mean(pp), quantile(pp, 0.025), quantile(pp, 0.975)))
}

pp <- low <- high <- as.numeric(NA)
value <- seq(0, 10, by = 0.5)
G.Sensitive <- data.frame(pp, low, high, value, outcome = "Agreement that women are as\ncompetent as men in politics")
G.Misreport <- data.frame(pp, low, high, value, outcome = "Misreport belief that women\n are not as competent as men in politics")

# Calculate predicted probabilities for each value of selfPlacement
for(i in 1:length(value)) {
  cat("\r", length(value) - i + 1, " ")

  sim.out.sensitive <- getSim("selfPlacement", G.Sensitive$value[i], par.sim.sensitive, A.Sim.Sensitive, A$weight)
  sim.out.misreport <- getSim("selfPlacement", G.Misreport$value[i], par.sim.misreport, A.Sim.Misreport, A$weight * A$pppSens)

  G.Sensitive$pp[i] <- sim.out.sensitive[[1]]
  G.Sensitive$low[i] <- sim.out.sensitive[[2]]
  G.Sensitive$high[i] <- sim.out.sensitive[[3]]

  G.Misreport$pp[i] <- sim.out.misreport[[1]]
  G.Misreport$low[i] <- sim.out.misreport[[2]]
  G.Misreport$high[i] <- sim.out.misreport[[3]]
}

# FIGURE 5 (LEFT PANEL)
pdf(paste0(outputDirectory, "/EADY-FIGURE5A-SensitivePP-raw.pdf"), 3.25, 2.75)
ggplot(G.Sensitive, aes(x = value, y = pp, ymin = low, ymax = high)) +
  my.theme(strip_colour = "white", bordersize = 1, base_size = 10) +
  coord_cartesian(x = c(-0.04, 10.04), y = c(-0.004, 1.004), expand = FALSE) +
  scale_x_continuous(breaks = 0:10, labels = c("0\n(Right)", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10\n(Left)")) +
  scale_y_continuous(breaks = seq(0, 1, by = 0.25), labels = c("0", "0.25", "0.5", "0.75", "1")) +
  labs(title = "Belief that women are\nas competent as men in politics",
       x = "Ideology", y = "Pr(Belief in equality)", fill = "", colour = "", label = "") +
  geom_ribbon(fill = "grey94", color = "grey90", size = 0.25) +
  geom_line() +
  theme(plot.margin = unit(c(0.5, 1, 0.5, 0.25), "lines"))
dev.off()

# FIGURE 5 (RIGHT PANEL)
pdf(paste0(outputDirectory, "/EADY-FIGURE5B-MisreportPP-raw.pdf"), 3.25, 2.75)
ggplot(G.Misreport, aes(x = value, y = pp, ymin = low, ymax = high)) +
  my.theme(strip_colour = "white", bordersize = 1, base_size = 10) +
  coord_cartesian(x = c(-0.04, 10.04), y = c(-0.004, 1.004), expand = FALSE) +
  scale_x_continuous(breaks = 0:10, labels = c("0\n(Right)", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10\n(Left)")) +
  scale_y_continuous(breaks = seq(0, 1, by = 0.25), labels = c("0", "0.25", "0.5", "0.75", "1")) +
  labs(title = "Misreport belief that women are\nnot as competent as men in politics",
       x = "Ideology", y = "Pr(Misreport)", fill = "", colour = "", label = "") +
  geom_ribbon(fill = "grey94", color = "grey90", size = 0.25) +
  geom_line() +
  theme(plot.margin = unit(c(0.5, 1, 0.5, 0.25), "lines"))
dev.off()

# DIFFERENCES IN PREDICTED PROBABILITIES AT VALUE OF SELF-PLACEMENT
G.Sensitive$pp[G.Sensitive$value == 0]
G.Sensitive$pp[G.Sensitive$value == 10]
G.Sensitive$pp[G.Sensitive$value == 10] - G.Sensitive$pp[G.Sensitive$value == 0]

G.Misreport$pp[G.Misreport$value == 0]
G.Misreport$pp[G.Misreport$value == 10]
G.Misreport$pp[G.Misreport$value == 10] - G.Misreport$pp[G.Misreport$value == 0]
